{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "7e4c9601-fcb8-49e6-ae48-4de8b6cac494",
   "metadata": {
    "id": "7e4c9601-fcb8-49e6-ae48-4de8b6cac494"
   },
   "source": [
    "<div>\n",
    "    <h1 style=\"text-align: center;\">Large Language Models Projects</h1>\n",
    "    <h3>Apply and Implement Strategies for Large Language Models</h3>\n",
    "    <h2>2_2-ChromaDB Server mode</h2>\n",
    "    \n",
    "</div>\n",
    "\n",
    "by [Pere Martra](https://www.linkedin.com/in/pere-martra/)\n",
    "_________\n",
    "This notebook is set up to run in a local environment. If executed in Google Colab, access to the ChromaDB server that will be created won't be possible.\n",
    "\n",
    "The code is the same as the one used in notebook 2_1_Vector_Databases_LLMs.ipynb.\n",
    "The client has been implemented in the notebook 2_3-ChromaDB Client.\n",
    "__________"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fbdcd16f-1aca-442b-8d84-3fafdba82dd2",
   "metadata": {
    "id": "fbdcd16f-1aca-442b-8d84-3fafdba82dd2",
    "outputId": "15a5c4e0-ba39-41f0-f9fc-7245b1f98919"
   },
   "outputs": [],
   "source": [
    "# Use the %pip magic (not !pip) so the package is installed into the environment of the running kernel.\n",
    "%pip install chromadb==0.4.20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d6c765af-dc87-41e3-891d-a6e934a53259",
   "metadata": {
    "id": "d6c765af-dc87-41e3-891d-a6e934a53259"
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8be6c63a-5077-4550-98f7-6e3e118872a5",
   "metadata": {
    "id": "8be6c63a-5077-4550-98f7-6e3e118872a5"
   },
   "outputs": [],
   "source": [
    "# The dataset must be downloaded and unzipped from Kaggle first:\n",
    "# https://www.kaggle.com/datasets/kotartemiy/topic-labeled-news-dataset\n",
    "\n",
    "# Row limit for the example and the names of the columns used later on.\n",
    "MAX_NEWS = 1000\n",
    "DOCUMENT = \"title\"\n",
    "TOPIC = \"topic\"\n",
    "\n",
    "# Point read_csv at the directory where the .csv file is stored.\n",
    "news = pd.read_csv(\n",
    "    \"./kaggle/labelled_newscatcher_dataset.csv\",\n",
    "    sep=\";\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a2fd5a3-29a4-4d5b-9527-fd431671ed93",
   "metadata": {
    "id": "6a2fd5a3-29a4-4d5b-9527-fd431671ed93"
   },
   "outputs": [],
   "source": [
    "# This is only an example, so we keep just the first MAX_NEWS rows.\n",
    "subset_news = news.iloc[:MAX_NEWS]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e7fc76a-c828-4db4-bada-b1e14c2eefea",
   "metadata": {
    "id": "5e7fc76a-c828-4db4-bada-b1e14c2eefea"
   },
   "outputs": [],
   "source": [
    "import chromadb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "04b25934-62bd-4720-871a-7e516807c3e7",
   "metadata": {
    "id": "04b25934-62bd-4720-871a-7e516807c3e7"
   },
   "outputs": [],
   "source": [
    "# Persistent client whose data lives in the local ./chromadb directory.\n",
    "chroma_client = chromadb.PersistentClient(\n",
    "    path=\"./chromadb\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06549d87-b66f-44b7-a1dc-b6c122685602",
   "metadata": {
    "id": "06549d87-b66f-44b7-a1dc-b6c122685602"
   },
   "outputs": [],
   "source": [
    "collection_name = \"local_news_collection\"\n",
    "\n",
    "# Remove a previous copy of the collection so that it can be created again on re-runs.\n",
    "# Every existing collection is checked, not only the first one returned by list_collections().\n",
    "existing_names = [existing.name for existing in chroma_client.list_collections()]\n",
    "if collection_name in existing_names:\n",
    "    chroma_client.delete_collection(name=collection_name)\n",
    "\n",
    "collection = chroma_client.create_collection(name=collection_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8b6d5c4-0f6c-4bfd-9bab-6f0df0fa2469",
   "metadata": {
    "id": "f8b6d5c4-0f6c-4bfd-9bab-6f0df0fa2469",
    "outputId": "a281dc8b-4d31-4ecf-a50a-95cec127d7d6"
   },
   "outputs": [],
   "source": [
    "# Store the titles as documents and the topic of each one as metadata.\n",
    "# One id is generated per row actually present in subset_news: the CSV may hold fewer\n",
    "# than MAX_NEWS rows, and documents, metadatas and ids must all have the same length.\n",
    "collection.add(\n",
    "    documents=subset_news[DOCUMENT].tolist(),\n",
    "    metadatas=[{TOPIC: topic} for topic in subset_news[TOPIC].tolist()],\n",
    "    ids=[f\"id{x}\" for x in range(len(subset_news))],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c9211e7-bd14-4cd0-b617-6b520de23e1b",
   "metadata": {
    "id": "9c9211e7-bd14-4cd0-b617-6b520de23e1b",
    "outputId": "2358d16f-6269-4aa9-de1a-5f6f4c15134d"
   },
   "outputs": [],
   "source": [
    "# Query the collection with the text \"laptop\" and request 10 results.\n",
    "results = collection.query(\n",
    "    query_texts=[\"laptop\"],\n",
    "    n_results=10,\n",
    ")\n",
    "\n",
    "print(results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "65a107f7-efbb-4a4a-8fe4-4137f00d925f",
   "metadata": {
    "id": "65a107f7-efbb-4a4a-8fe4-4137f00d925f",
    "outputId": "fdc6fa12-0122-45a1-e3d6-07c23e8f994d"
   },
   "outputs": [],
   "source": [
    "# Running Chroma in Server Mode, using the data stored in ./chromadb.\n",
    "# NOTE(review): this cell presumably stays busy while the server is running - confirm.\n",
    "!chroma run --path ./chromadb"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bcb943bf-a5b2-4bb3-a265-72e2633ab98a",
   "metadata": {
    "id": "bcb943bf-a5b2-4bb3-a265-72e2633ab98a"
   },
   "source": [
    "The code to test this server is in the notebook 2_3-ChromaDB Client.ipynb."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9428dd9-39dd-4d96-9bdb-05b6c00cfe67",
   "metadata": {
    "id": "a9428dd9-39dd-4d96-9bdb-05b6c00cfe67"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
